post thumbnail

Building an IP Proxy Pool: Crawler Proxy Pool

Construct a reliable proxy pool service that provides efficient proxies for thousands of crawlers, ensuring each crawler receives proxy IPs appropriate for its target websites. Internally, the service is implemented with distributed deep web crawling to guarantee speed and reliability. Note: the company's internal product cannot be open-sourced. However, in my personal time, I aim to build a simple proxy pool service using open-source tools.

2022-12-28

How to Resolve IP Blocking?

Code Implementation
The following code extracts proxy IP addresses from the HTML element with class="odd":

from bs4 import BeautifulSoup
import requests
import time

def open_proxy_url(url):
    """Download the page at ``url`` and return its decoded text.

    Args:
        url: Address of the proxy-listing page to fetch.

    Returns:
        The response body as ``str``, or ``None`` when the request fails
        (connection error, timeout, or a 4xx/5xx status).
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        r = requests.get(url, headers=headers, timeout=20)
        r.raise_for_status()
    except requests.RequestException:
        # Catch request failures only, so Ctrl-C and programming errors
        # still propagate instead of being silently swallowed.
        print(f'Unable to access webpage: {url}')
        return None
    # Decode with the encoding detected from the body rather than the
    # one declared in the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text

def get_proxy_ip(response):
    """Extract proxy URLs from the elements with class="odd" in an HTML page.

    Args:
        response: HTML text of the proxy-listing page.

    Returns:
        A list of ``'PROTOCOL://ip:port'`` strings, one for every matching
        row whose protocol cell is exactly ``HTTP`` or ``HTTPS``.
    """
    proxy_ip_list = []
    soup = BeautifulSoup(response, 'html.parser')
    for row in soup.select('.odd'):  # Select elements with class="odd"
        cells = row.select('td')
        # Cells 1, 2 and 5 are read below; skip rows that are too short
        # instead of failing with IndexError.
        if len(cells) < 6:
            continue
        # Strip surrounding whitespace so it cannot end up inside the URL.
        ip = cells[1].text.strip()
        port = cells[2].text.strip()
        protocol = cells[5].text.strip()
        if protocol in ('HTTP', 'HTTPS'):
            proxy_ip_list.append(f'{protocol}://{ip}:{port}')
    return proxy_ip_list

if __name__ == '__main__':
    # Download the listing page, cache it on disk, then parse the cached copy.
    proxy_url = 'https://www.xicidaili.com/'
    proxy_ip_filename = 'proxy_ip.txt'
    text = open_proxy_url(proxy_url)
    if text is None:
        # The download failed; writing None to the file would raise TypeError.
        raise SystemExit(f'Could not download proxy list from {proxy_url}')
    # Use an explicit encoding so the cached HTML round-trips regardless of
    # the platform default.
    with open(proxy_ip_filename, 'w', encoding='utf-8') as f:
        f.write(text)
    with open(proxy_ip_filename, 'r', encoding='utf-8') as f:
        text = f.read()
    proxy_ip_list = get_proxy_ip(text)
    print(proxy_ip_list)

Issue with Missing Data
Some proxy IPs are not captured because they lack class="odd". Modify the parser to include all <tr> tags under id="ip_list":

def get_proxy_ip(response):
    """Extract proxy URLs from every <tr> under the element with id="ip_list".

    Args:
        response: HTML text of the proxy-listing page.

    Returns:
        A list of ``'protocol://ip:port'`` strings (protocol lower-cased)
        for every row with at least 8 cells whose protocol is http or
        https. Returns an empty list when the page has no id="ip_list"
        element.
    """
    proxy_ip_list = []
    soup = BeautifulSoup(response, 'html.parser')
    table = soup.find(id='ip_list')
    if table is None:
        # find() returns None when the element is absent (for example on an
        # error page); report "no proxies" instead of raising AttributeError.
        return proxy_ip_list
    for row in table.find_all('tr'):
        cells = row.select('td')
        # Rows with fewer than 8 cells are skipped.
        if len(cells) < 8:
            continue
        # Strip surrounding whitespace so it cannot end up inside the URL.
        ip = cells[1].text.strip()
        port = cells[2].text.strip()
        protocol = cells[5].text.strip().lower()  # Normalize protocol
        if protocol in ('http', 'https'):
            proxy_ip_list.append(f'{protocol}://{ip}:{port}')
    return proxy_ip_list

Using Proxies
Proxies are passed to requests as a dictionary through the proxies keyword argument:

def open_url_using_proxy(url, proxy):
    """Fetch ``url`` with ``proxy`` supplied to requests.

    The proxy is registered under the ``'https'`` key when its own URL
    starts with "https" (any letter case) and under ``'http'`` otherwise.

    Args:
        url: Address of the page to fetch.
        proxy: Proxy URL such as ``'http://10.10.1.10:3128'``.

    Returns:
        A ``(text, status_code)`` tuple on success, or ``False`` when the
        request fails.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'
    headers = {'User-Agent': user_agent}
    # Compare case-insensitively so mixed-case schemes ("Https://...") are
    # classified the same way as "https://" and "HTTPS://".
    scheme_key = 'https' if proxy.lower().startswith('https') else 'http'
    proxies = {scheme_key: proxy}
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        r.raise_for_status()
    except (requests.RequestException, ValueError):
        # Request failures are expected for dead proxies. ValueError is
        # included because a malformed proxy string may presumably surface
        # as a URL-parsing error. Anything else (e.g. Ctrl-C) propagates.
        print(f'Unable to access webpage: {url}')
        print(f'Invalid proxy IP: {proxy}')
        return False
    r.encoding = r.apparent_encoding
    return (r.text, r.status_code)

Proxy Validation
Verify proxy effectiveness by checking status codes and page titles (e.g., Baidu):

def check_proxy_avaliability(proxy, expected_title='Baidu - Search'):
    """Check whether ``proxy`` can fetch the Baidu home page.

    Args:
        proxy: Proxy URL such as ``'http://10.10.1.10:3128'``.
        expected_title: Exact ``<title>`` text the fetched page must have.
            NOTE(review): the default must match the live page's title
            exactly; confirm it against the real site.

    Returns:
        ``True`` when the page is fetched with status 200 and its title
        equals ``expected_title``; ``False`` otherwise.
    """
    # open_url_using_proxy files the proxy under the key matching the
    # proxy's own scheme, and requests applies a proxies entry only to URLs
    # of that scheme. Test with a URL of the same scheme, otherwise the
    # request would go out directly and "validate" a proxy never used.
    url_scheme = 'https' if proxy.lower().startswith('https') else 'http'
    url = f'{url_scheme}://www.baidu.com'
    result = open_url_using_proxy(url, proxy)
    if result:
        text, status_code = result
        if status_code == 200:
            soup = BeautifulSoup(text, 'html.parser')
            title_tag = soup.find('title')
            # find() returns None for a page without <title>, e.g. an error
            # page injected by the proxy.
            if title_tag is not None and title_tag.text.strip() == expected_title:
                print(f'Valid proxy IP: {proxy}')
                return True
    print(f'Invalid proxy IP: {proxy}')
    return False

HTTP vs. HTTPS Proxies

  # Example proxy mapping in the form handed to requests: one entry per
  # scheme, each pointing at a proxy URL.
  proxies = dict(
      http='http://10.10.1.10:3128',
      https='https://10.10.1.11:1080',
  )


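Use an IP-echo service such as JSON IP, which reports the IP address a request arrived from, to validate proxies: if the reported address is the proxy's rather than your own, the request really went through the proxy.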

References